#Likely Vote Models ONLY
#MAIN TEXT - Aug 29

if (!require(pacman)) install.packages("pacman")

pacman::p_load(tidyverse, # tidyverse 
               plm, # panel data analysis
               pglm, # generalized panel data analysis
               clubSandwich, # robust SEs
               here, # computational reproducibility 
               glue, # gluing objects and strings 
               tidylog, # logging analysis
               naniar, # missing data 
               zeallot, # multiple assignments 
               readxl, 
               ggpubr,
               hrbrthemes, 
               wesanderson,
               broom, 
               sjPlot, 
               jtools, 
               patchwork, 
               broom.mixed, 
               estimatr, 
               stargazer,
               DeclareDesign,
               sensemakr,
               mice,
               usmap,
               janitor,
               modelsummary, 
               flextable,
               officer,
               extrafont,
               gt,
               bootstrap,
               lme4,
               clipr,
               infer)

source(here("functions/utils.r"))
source(here("functions/theme.R"))

ggplot2::theme_set(theme_bw())

# no scientific notation
options(scipen = 999)

#load files
df <- read_csv(here("panel_data_jul24_2.csv"))

#mark non-registered as NA
table(df$votereg, df$wave)
df$nonvoter <- NA 
df$nonvoter <- ifelse(df$votereg==1,1, NA)
table(df$nonvoter, df$wave)

## remove non-voters marked as NA, keep wave 2 because question wasn't asked
to_remove <- NA 
to_remove <- df %>%
  filter(is.na(df$nonvoter) & wave != 2) %>%
  select(pid)

###
table(df$votereg, df$wave)
table(df$nonvoter, df$wave)

##
df %>%
  filter(wave != 1) %>%
  select(gendiscrim, apa.discrim.rona, usborn, edu, DEM, GOP, age, male, natorigin, biden, `X2020likelyvote`) %>%
  vis_miss()

df$likely_vote <- df$X2020likelyvote

# create a proxy variable
df$proxy <- rep(subset(df, wave == 1)$gendiscrim, 3)

## check the proxy data distribution
df$proxy %>% table()

df$prior[df$proxy == 0.5] <- "Middle"
df$prior[df$proxy < 0.5] <- "Low"
df$prior[df$proxy > 0.5] <- "High"

min(df$proxy) ; max(df$proxy)

df$usborn <- factor(df$usborn)
df$male <- factor(df$male)
df$chinese <- factor(df$chinese)
df$indian <- factor(df$indian)
df$GOP <- factor(df$GOP)
df$DEM <- factor(df$DEM)
df$wave <- factor(df$wave)
df$nat_origin <- factor(df$natorigin)

model.outs <- cal_model_outputs(df)
model.outs.low <- cal_model_outputs(df %>% filter(prior == "Low"))
model.outs.middle <- cal_model_outputs(df %>% filter(prior == "Middle"))
model.outs.high <- cal_model_outputs(df %>% filter(prior == "High"))

cm <- c("apa.discrim.rona" = "COVID Discrim",
        "gendiscrim" = "General Discrim (Post COVID)",
        "usborn1" = "US Born",
        "edu" = "Education",
        "DEM1" = "Democratic Party",
        "GOP1" = "Republican Party",
        "age" = "Age",
        "male1" = "Male",
        "wave3" = "Wave3",
        "proxy" = "Wave 1 General Discrim",
        "apa.discrim.rona:proxy" = "COVID Discrim* Pre-COVID General Discrim",
        "chinese1" = "Chinese",
        "indian1" = "Indian")


##
save <- df


## Table 3
turnout_mod <- list(
  
  "OLS" = lm(likely_vote ~ gendiscrim + apa.discrim.rona + gendiscrim + usborn + edu + DEM + GOP + age + male + wave + nat_origin, data = df),
  
  "Weighted" = lm(likely_vote ~ gendiscrim + apa.discrim.rona + gendiscrim + usborn + edu + DEM + GOP + age + male + wave + nat_origin, data = df, weight = weights),
  
  "Fixed Effects" = plm(likely_vote ~ gendiscrim + apa.discrim.rona, data = df, index = c("pid", "wave"), model = "within"),
  
  "Interaction term" = lm(likely_vote ~ apa.discrim.rona + gendiscrim + usborn + edu + DEM + GOP + age + male + wave + proxy + apa.discrim.rona:proxy + nat_origin, data = df)
  
)

suppressWarnings({

modelsummary(turnout_mod,
             fmt = 3,
             coef_map = cm,
             coef_omit = "Intercept|nat_origin",
             statistic = c("p = {p.value}"),
             output = here("outputs", "table3.docx"),
             vcov = "robust")

})

table(df$wave) #number of participants